{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-1-a64b87715492>:28: DeprecationWarning: use options instead of chrome_options\n",
      "  driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,\n"
     ]
    }
   ],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "from time import sleep\n",
    "import requests\n",
    "import time\n",
    "from bs4 import BeautifulSoup\n",
    "import pytesseract\n",
    "from PIL import Image\n",
    "from datetime import datetime\n",
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "\n",
    "#caps=dict()\n",
    "#caps[\"pageLoadStrategy\"] = \"none\"   # Do not wait for full page load\n",
    "\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')#解决DevToolsActivePort文件不存在的报错\n",
    "opts.add_argument('window-size=1920x3000') #指定浏览器分辨率\n",
    "opts.add_argument('--disable-gpu') #谷歌文档提到需要加上一这个属性来规避bug\n",
    "opts.add_argument('--hide-scrollbars') #隐藏滚动条, 应对些特殊页面\n",
    "#opts.add_argument('blink-settings=imagesEnabled=false') #不加载图片, 提升速度\n",
    "#opts.add_argument('--headless') #浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败\n",
    "# opts.binary_location = \"C:\\portable\\PortableApps\\IronPortable\\App\\Iron\\chrome.exe\"\n",
    "# opts.binary_location = \"C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe\" #\"H:\\_coding_\\Gitee\\InternetNewMedia\\CapstonePrj2016\\chromedriver.exe\"  \n",
    "\n",
    "\n",
    "driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.get(\"https://cnki.net\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'中山大学南...'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sleep(5)\n",
    "element = driver.find_element_by_id('Ecp_loginShowName1')\n",
    "element.get_attribute('innerHTML')#检查有无连上校园网"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_id('highSearch').click()\n",
    "#driver.find_element_by_xpath('//*[@class=\"info_list username\"]').get_attributeatt('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'CDwindow-6E7C1C9175B8B9601AB836744E4E9F4D'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "driver.current_window_handle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-6E7C1C9175B8B9601AB836744E4E9F4D',\n",
       " 'CDwindow-AE36C09C029B08CB638B58CC0695A0A5']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-7-fc7a71ef64f3>:1: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])#切换至第二窗口\n"
     ]
    }
   ],
   "source": [
    "driver.switch_to_window(driver.window_handles[1])#切换至第二窗口"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('/html/body/div[3]/div[1]/div/ul[1]/li[1]/a')\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()\n",
    "payload =  {\"主题\": \"新媒体\"}\n",
    "driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/dl/dd[1]/div[2]/input').send_keys(payload['主题'])\n",
    "driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[2]/input').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>高校校园媒体融合发展机遇、现状与对策</td>\n",
       "      <td>穆冠成; 李秀芹; 崔文斐</td>\n",
       "      <td>山东理工大学学报(社会科学版)</td>\n",
       "      <td>2021-07-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>新媒体语境下基层党建宣传工作的思考</td>\n",
       "      <td>王瑞芳</td>\n",
       "      <td>中小企业管理与科技(上旬刊)</td>\n",
       "      <td>2021-07-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>运用新媒体助力非公企业党建工作发展办法研究</td>\n",
       "      <td>周全</td>\n",
       "      <td>中小企业管理与科技(上旬刊)</td>\n",
       "      <td>2021-07-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>做规划时 如何才不被动</td>\n",
       "      <td>唐安妮</td>\n",
       "      <td>成才与就业</td>\n",
       "      <td>2021-07-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>乡村振兴视阈下郢城泥陶传承与发展策略研究</td>\n",
       "      <td>王华龙; 桑俊; 马盈颖</td>\n",
       "      <td>今古文创</td>\n",
       "      <td>2021-07-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>新时代高职数学教学改革策略研究</td>\n",
       "      <td>王刚</td>\n",
       "      <td>青岛远洋船员职业学院学报</td>\n",
       "      <td>2021-06-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>立德树人视域下高校党建品牌打造与构建大思政育人格局路径探析</td>\n",
       "      <td>王聪</td>\n",
       "      <td>佳木斯职业学院学报</td>\n",
       "      <td>2021-06-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>疫情防控背景下高校校园新媒体思政功能的激活与运用</td>\n",
       "      <td>俞珂瑶; 马向东</td>\n",
       "      <td>佳木斯职业学院学报</td>\n",
       "      <td>2021-06-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>新媒体时代高职英语教育教学创新探索</td>\n",
       "      <td>吴欣欣</td>\n",
       "      <td>佳木斯职业学院学报</td>\n",
       "      <td>2021-06-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>基于全媒体时代下的高校思想政治工作创新研究</td>\n",
       "      <td>董亚君; 徐东升; 张晋</td>\n",
       "      <td>佳木斯职业学院学报</td>\n",
       "      <td>2021-06-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11</td>\n",
       "      <td>推动学生思想政治教育工作的融媒体研究——以多元智能视域下高校新媒体公众号为例</td>\n",
       "      <td>林婷婷; 欧阳超群</td>\n",
       "      <td>佳木斯职业学院学报</td>\n",
       "      <td>2021-06-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>12</td>\n",
       "      <td>《天津建设科技》编辑部</td>\n",
       "      <td>NaN</td>\n",
       "      <td>天津建设科技</td>\n",
       "      <td>2021-06-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>13</td>\n",
       "      <td>新媒体文学批评在“超真实”世界中的时代变构</td>\n",
       "      <td>史丽娜</td>\n",
       "      <td>当代文坛</td>\n",
       "      <td>2021-06-29</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>14</td>\n",
       "      <td>北京市医院短视频平台传播力现状及展望</td>\n",
       "      <td>韩扬阳; 李与涵; 高天; 郭蕊</td>\n",
       "      <td>中国医院</td>\n",
       "      <td>2021-06-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>大学生党员教育管理长效机制的路径选择</td>\n",
       "      <td>李萌; 张一斐; 贾华</td>\n",
       "      <td>人才资源开发</td>\n",
       "      <td>2021-06-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>16</td>\n",
       "      <td>“互联网+”背景下面向知识服务模式的科技期刊编辑职业规划研究——以《江苏农业学报》编辑部为例</td>\n",
       "      <td>徐艳; 蒋永忠</td>\n",
       "      <td>农业科技管理</td>\n",
       "      <td>2021-06-26</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>17</td>\n",
       "      <td>新媒体视野下常德丝弦的创新与传播研究  网络首发</td>\n",
       "      <td>田正铁</td>\n",
       "      <td>四川戏剧</td>\n",
       "      <td>2021-06-25 14:42</td>\n",
       "      <td>NaN</td>\n",
       "      <td>102.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>新媒体视野下常德丝弦的创新与传播研究</td>\n",
       "      <td>田正铁</td>\n",
       "      <td>四川戏剧</td>\n",
       "      <td>2021-06-25 14:42</td>\n",
       "      <td>NaN</td>\n",
       "      <td>116.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>19</td>\n",
       "      <td>社会治理视域下网络统战的内涵特质、价值取向与实践路径</td>\n",
       "      <td>史亚博</td>\n",
       "      <td>上海市社会主义学院学报</td>\n",
       "      <td>2021-06-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>20</td>\n",
       "      <td>新冠肺炎疫情下甘孜州全域旅游经济问题与对策</td>\n",
       "      <td>邹鹏</td>\n",
       "      <td>经济研究导刊</td>\n",
       "      <td>2021-06-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Unnamed: 0                                              篇名  \\\n",
       "0            1                              高校校园媒体融合发展机遇、现状与对策   \n",
       "1            2                               新媒体语境下基层党建宣传工作的思考   \n",
       "2            3                           运用新媒体助力非公企业党建工作发展办法研究   \n",
       "3            4                                     做规划时 如何才不被动   \n",
       "4            5                            乡村振兴视阈下郢城泥陶传承与发展策略研究   \n",
       "5            6                                 新时代高职数学教学改革策略研究   \n",
       "6            7                   立德树人视域下高校党建品牌打造与构建大思政育人格局路径探析   \n",
       "7            8                        疫情防控背景下高校校园新媒体思政功能的激活与运用   \n",
       "8            9                               新媒体时代高职英语教育教学创新探索   \n",
       "9           10                           基于全媒体时代下的高校思想政治工作创新研究   \n",
       "10          11          推动学生思想政治教育工作的融媒体研究——以多元智能视域下高校新媒体公众号为例   \n",
       "11          12                                     《天津建设科技》编辑部   \n",
       "12          13                           新媒体文学批评在“超真实”世界中的时代变构   \n",
       "13          14                              北京市医院短视频平台传播力现状及展望   \n",
       "14          15                              大学生党员教育管理长效机制的路径选择   \n",
       "15          16  “互联网+”背景下面向知识服务模式的科技期刊编辑职业规划研究——以《江苏农业学报》编辑部为例   \n",
       "16          17                        新媒体视野下常德丝弦的创新与传播研究  网络首发   \n",
       "17          18                              新媒体视野下常德丝弦的创新与传播研究   \n",
       "18          19                      社会治理视域下网络统战的内涵特质、价值取向与实践路径   \n",
       "19          20                           新冠肺炎疫情下甘孜州全域旅游经济问题与对策   \n",
       "\n",
       "                  作者               刊名              发表时间  被引     下载  操作  \n",
       "0      穆冠成; 李秀芹; 崔文斐  山东理工大学学报(社会科学版)        2021-07-01 NaN    NaN  下载  \n",
       "1                王瑞芳   中小企业管理与科技(上旬刊)        2021-07-01 NaN    NaN  下载  \n",
       "2                 周全   中小企业管理与科技(上旬刊)        2021-07-01 NaN    NaN  下载  \n",
       "3                唐安妮            成才与就业        2021-07-01 NaN    NaN  下载  \n",
       "4       王华龙; 桑俊; 马盈颖             今古文创        2021-07-01 NaN    NaN  下载  \n",
       "5                 王刚     青岛远洋船员职业学院学报        2021-06-30 NaN    NaN  下载  \n",
       "6                 王聪        佳木斯职业学院学报        2021-06-30 NaN    NaN  下载  \n",
       "7           俞珂瑶; 马向东        佳木斯职业学院学报        2021-06-30 NaN    NaN  下载  \n",
       "8                吴欣欣        佳木斯职业学院学报        2021-06-30 NaN    NaN  下载  \n",
       "9       董亚君; 徐东升; 张晋        佳木斯职业学院学报        2021-06-30 NaN    NaN  下载  \n",
       "10         林婷婷; 欧阳超群        佳木斯职业学院学报        2021-06-30 NaN    NaN  下载  \n",
       "11               NaN           天津建设科技        2021-06-30 NaN    NaN  下载  \n",
       "12               史丽娜             当代文坛        2021-06-29 NaN    NaN  下载  \n",
       "13  韩扬阳; 李与涵; 高天; 郭蕊             中国医院        2021-06-28 NaN   32.0  下载  \n",
       "14       李萌; 张一斐; 贾华           人才资源开发        2021-06-28 NaN    4.0  下载  \n",
       "15           徐艳; 蒋永忠           农业科技管理        2021-06-26 NaN    NaN  下载  \n",
       "16               田正铁             四川戏剧  2021-06-25 14:42 NaN  102.0  下载  \n",
       "17               田正铁             四川戏剧  2021-06-25 14:42 NaN  116.0  下载  \n",
       "18               史亚博      上海市社会主义学院学报        2021-06-25 NaN   19.0  下载  \n",
       "19                邹鹏           经济研究导刊        2021-06-25 NaN    8.0  下载  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sleep(3)\n",
    "element = driver.find_element_by_xpath('//*[@id=\"perPageDiv\"]/div')\n",
    "driver.execute_script(\"arguments[0].click();\", element)\n",
    "element = driver.find_element_by_xpath('//*[@id=\"perPageDiv\"]/ul/li[3]')\n",
    "driver.execute_script(\"arguments[0].click();\", element)#当前页面展示50篇文章\n",
    "element = driver.find_element_by_id('gridTable')\n",
    "page_html = element.get_attribute('innerHTML')\n",
    "data = pd.read_html(page_html)[0]\n",
    "data#展示当前页面文章列表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-10-be5f2638ef17>:1: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "120"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "driver.switch_to_window(driver.window_handles[1])\n",
    "element = driver.find_element_by_xpath('//span[@class=\"total\"]')\n",
    "max_page=int(element.get_attribute(\"textContent\").replace(\"共\",\"\").replace(\"页\",\"\"))\n",
    "max_page#抓取总页数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-12-8af46565a00e>:5: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[2])\n",
      "<ipython-input-12-8af46565a00e>:14: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[2])#跳转详情界面\n",
      "<ipython-input-12-8af46565a00e>:17: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-12-8af46565a00e>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      9\u001b[0m     \u001b[0melement\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element_by_xpath\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'//*[@id=\"pdfDown\"]'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     10\u001b[0m     \u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute_script\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"arguments[0].click();\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0melement\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 11\u001b[1;33m     \u001b[0msleep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m6\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     12\u001b[0m     \u001b[0mnums\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlistdir\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     13\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0mnums\u001b[0m\u001b[1;33m>\u001b[0m\u001b[0mnum\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "for a in range(1,max_page):\n",
    "    allarticle = driver.find_elements_by_xpath('//td[@class=\"name\"]//a')\n",
    "    for x in allarticle:\n",
    "        x.click()\n",
    "        sleep(5)\n",
    "        driver.switch_to_window(driver.window_handles[2])\n",
    "        path = r\"C:\\Users\\待冠名\\Downloads\"      # 输入文件夹地址\n",
    "        files = os.listdir(path)   # 读入文件夹\n",
    "        num = len(files) \n",
    "        element=driver.find_element_by_xpath('//*[@id=\"pdfDown\"]')\n",
    "        driver.execute_script(\"arguments[0].click();\", element)\n",
    "        sleep(6)\n",
    "        nums= len(os.listdir(path))\n",
    "        if nums>num:\n",
    "            driver.switch_to_window(driver.window_handles[2])#跳转详情界面\n",
    "            driver.close()\n",
    "            sleep(1)\n",
    "            driver.switch_to_window(driver.window_handles[1])\n",
    "            #无文件新增则进行验证码页面的判定\n",
    "        else:\n",
    "            #出现验证码\n",
    "            driver.switch_to_window(driver.window_handles[-1])#跳转到最后一个页面\n",
    "            try:\n",
    "                driver.find_element_by_id('vImg')\n",
    "                for i in range(0,5000):\n",
    "                    element = driver.find_element_by_xpath('//*[@id=\"vImg\"]')\n",
    "                    element.click()\n",
    "                    driver.find_element_by_id('vcode').clear()\n",
    "                    filename = \"./image/imagedown.png\"\n",
    "                    driver.save_screenshot(filename)#截屏\n",
    "                    element = driver.find_element_by_id('vImg')#需要截取的元素定位                                        \n",
    "                    left = element.location['x']+60#获取元素上下左右的位置\n",
    "                    top = element.location['y']+70\n",
    "                    right = element.location['x'] + element.size['width']+150\n",
    "                    bottom = element.location['y'] + element.size['height']+70\n",
    "                    im = Image.open(filename)#打开刚才的截图\n",
    "                    im = im.crop((left, top, right, bottom))#截取对应位置\n",
    "                    im.save(filename)#保存覆盖原有截图\n",
    "                    #识别出验证码\n",
    "                    code = pytesseract.image_to_string(Image.open('./image/imagedown.png'),lang='eng')\n",
    "                    print(code)\n",
    "                    driver.find_element_by_id('vcode').send_keys(code)\n",
    "                    driver.find_element_by_xpath('/html/body/div/form/dl/dd/button').click()\n",
    "                    try:\n",
    "                        driver.find_element_by_xpath ('//*[@id=\"vImg\"]')\n",
    "                    except:\n",
    "                        driver.switch_to_window(driver.window_handles[2])\n",
    "                        sleep(2)\n",
    "                        driver.close()\n",
    "                        driver.switch_to_window(driver.window_handles[1])\n",
    "                        break\n",
    "            except:\n",
    "                driver.switch_to_window(driver.window_handles[2])#跳转详情界面\n",
    "                driver.close()#关闭详情页面\n",
    "                sleep(2)                    \n",
    "                driver.switch_to_window(driver.window_handles[1])#回到链接页面\n",
    "    element=driver.find_element_by_xpath('//*[@id=\"PageNext\"]')#下载完该页面所有pdf后翻页\n",
    "    element.click()\n",
    "    sleep(2)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "driver.switch_to_window(driver.window_handles[1])#返回列表页\n",
    "driver.find_element_by_id('page1')#回到第一页\n",
    "for i in range(0,10):\n",
    "    driver.find_element_by_id('selectCheckAll1').click()\n",
    "    element = driver.find_element_by_id('PageNext')\n",
    "    element.click()#选中500篇\n",
    "    sleep(5)\n",
    "driver.find_element_by_xpath('//i[@class=\"icon-d\"]').click()\n",
    "driver.find_element_by_xpath('//i[@class=\"icon-r\"]').click()\n",
    "driver.find_element_by_xpath('//a[@exporttype=\"Refworks\"]').click()\n",
    "driver.window_handles\n",
    "driver.switch_to_window(driver.window_handles[2])\n",
    "driver.find_element_by_xpath('//*[@id=\"litotxt\"]').click()        #导出refworks准备可视化\n",
    "driver.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def changevercode():\n",
    "    #pytesseract截图识别翻页验证码\n",
    "    driver.execute_script('window.scrollBy(0,-500)')#页面往上滚动500px\n",
    "    for i in range(0,500):\n",
    "        driver.find_element_by_id('changeVercode').click()\n",
    "        driver.find_element_by_id('vericode').clear()\n",
    "        filename = \"./image/image1.png\"\n",
    "        driver.save_screenshot(filename)#截屏\n",
    "        element = driver.find_element_by_xpath('//*[@id=\"changeVercode\"]')#需要截取的元素定位                                        \n",
    "        left = element.location['x']+195#获取元素上下左右的位置\n",
    "        top = element.location['y']-523\n",
    "        right = element.location['x'] + element.size['width']+215\n",
    "        bottom = element.location['y'] + element.size['height']-423\n",
    "        im = Image.open(filename)#打开刚才的截图\n",
    "        im = im.crop((left, top, right, bottom))#截取对应位置\n",
    "        im.save(filename)#保存覆盖原有截图\n",
    "        #识别出验证码\n",
    "        code = pytesseract.image_to_string(Image.open('./image/image1.png'),lang='eng')\n",
    "        print(code)\n",
    "        driver.find_element_by_xpath('//*[@id=\"vericode\"]').send_keys(code)\n",
    "        driver.find_element_by_id('checkCodeBtn').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for a in range(1,max_page):\n",
    "    allarticle = driver.find_elements_by_xpath('//td[@class=\"name\"]//a')\n",
    "    for x in allarticle:\n",
    "        driver.switch_to_window(driver.window_handles[1])#回到列表页\n",
    "        allarticle = driver.find_elements_by_xpath('//td[@class=\"name\"]//a')\n",
    "        x.click()\n",
    "        sleep(5)\n",
    "        driver.switch_to_window(driver.window_handles[2])\n",
    "        element=driver.find_element_by_xpath('//*[@id=\"pdfDown\"]')\n",
    "        driver.execute_script(\"arguments[0].click();\", element)\n",
    "        driver.close()###关闭详情页面                    \n",
    "        driver.switch_to_window(driver.window_handles[1])###回到链接页面\n",
    "    element=driver.find_element_by_xpath('//*[@id=\"PageNext\"]')#下载完该页面所有pdf后翻页\n",
    "    element.click()\n",
    "    sleep(3)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
